% Written by Robert P. Lieli
%This version: Dec. 15, 2011
 
% Free to use and modify with proper acknowledgement. No warranties. 
%*******************************************************************************

%                         ***GENERAL DESCRIPTION***     

%The main purpose of this program is to estimate parametric binary
%classification models by the 'maximum utility' (MU) method when the training 
%data set is an endogenously stratified sample. Benchmark models are also 
%estimated by ML.   

%The code in its present form is geared towards replicating the estimation 
%results in the paper 'Closing the Gap Between Risk Estimation and 
%Decision Making: Efficient Management of Trade Related Invasive Species Risk', 
%by Lieli and Springborn, forthcoming, Review of Economics and Statistics.
%Nevertheless, it can easily be modified to implement the MU method for other 
%data sets, model specifications and utility functions.  

%Notation corresponds to the notation used in the paper above. 

%*******************************************************************************

clear all;

%                               ***INPUTS***

%*******************************************************************************
% 1. General
%*******************************************************************************

work_dir_path='D:\Matlab junk';
%Specify a directory where some auxiliary files created by this code will be saved
%(directory must already exist). E.g., the code will save all bootstrap samples
%drawn from the original sample in this directory. 

%*******************************************************************************
% 2. The raw data and the assumed base rate of weeds
%*******************************************************************************

data=load(  'weed_data2.txt'  );
K=size(data,2); %number of columns (variables)

%Structure of the data set:

%Each row represents a different species. There are 370 species altoghter:
%Weeds = 286, Non-weeds =  84

%Col. 1: Indicator of weed status (1: weed; 0: non-weed)
%Col. 2: WRA Subscore: Biogeography and history
%Col. 3: WRA Subscore: Undesirable traits
%Col. 4: WRA Subscore: Biology/Ecology

%Note: Total WRA score = Col. 2 + Col. 3 + Col. 4 

%In general, the structure of the data set should be: Col. 1=dependent
%variable coded as 0/1; Col. 2 - Col. K = covariates.

%Assumed base rate of weeds (unconditional probability that Y=1)
tau=0.02;

%*******************************************************************************
% 3. In-sample vs. out-of-sample analysis
%*******************************************************************************

%Set number of bootstrap samples (if zero, only in-sample estimation is
%performed):

R=0;

%*******************************************************************************
% 4. Model specifications 
%*******************************************************************************

%Maximum utility
%---------------

%Let X be the matrix of covariates given by the last K-1 columns of the 
%'data' matrix as defined above. 

%The model to be fitted by MU has the form m( f(X), theta ), where f(.) is a 
%suitable preliminary transformation of X and theta is a (1 x p) parameter 
%vector.

%The function m(.,.) must be specified in the m-file m1.m. The preliminary 
%transformation f(X) of X must be specified in the m-file xm1.m. 

%The preliminary transformation can be used to extract the relevant columns
%of X, add a constant term (a column of ones), possibly some nonlinear
%terms such as X.^2, create the sum of the covariates, etc. One advantage 
%of creating f(X) 'outside' the m(.,.) function is that f(X) does not have 
%to be recalculated for every evaluation of the objective function. This is 
%important since our optimization method (simulated annealing) relies on 
%a very large number of function evaluations.  

%Specify the dimension of the vector theta:
p=2;

%Specify lower bounds for each component of theta:
l_bd=-40*ones(1, p);

%Specify upper bounds for each component of theta:
u_bd=20*ones(1, p);

%Note: the simulated annealing algorithm will search in the box defined by
%l_bd and u_bd

%Maximum likelihood (logit or cuachit) 
%-------------------------------------

%Let X be the matrix of covariates given by the last K-1 columns of the 
%'data' matrix as defined above. 

%The logit/cauchit models (to be fitted by ML) can be specified as follows. 
%These models have the form Lambda( g(X)'beta ), where g(.) is a 
%suitable transform of the matrix X and beta is a parameter vector. The 
%dimensionality of beta is equal to the number of columns in g(X).

%The link function Lambda(.) must be specified in the m-file Lambda.m. 
%The preliminary transformation g(X) of X must be specified in the m-file 
%xlogit1.m. 

%Logit vs. cauchit estimation must be specified below in the section titled

    % 1. ML estimation 
    %******************

%*******************************************************************************
% 5. Utility function 
%*******************************************************************************

%Let X be the matrix of covariates given by the last K-1 columns of the 
%'data' matrix as defined above. 

%Specify the optimal cutoff function c(X) in the m-file cutoff1.m
%Specify the weight function b(X) in the m-file b1.m

%*******************************************************************************
% 6. Parameters of the simulated annealing algorithm 
%*******************************************************************************

%To set/modify other parameters of the simulated annealing algorithm,
%see the m-file MU_Est_Given_Data_Pref_Mod.m.

%The initial values of theta from which the algorithm starts searching are 
%specified below in the section titled 

    %   2. MU Estimation
    %  ******************

%The inital values are specified as the rows of a ? x p matrix; the algorithm 
%will run as many times as there are rows, i.e., initial values specified. 
%The default setting is two different initial values: 1) Start from the 
%logit/cauchit coefficient estimates; 2) Start from a random point in the 
%parameter space specified above. 


%                  ***DESCRIPTION OF SETTINGS END HERE***


%                         * START OF MAIN CODE *
 
%**************************************************************************
%                           SAVE DATA SET AGAIN
%              GENERATE AND SAVE BOOTSTRAP SAMPLES (if applicable)
%**************************************************************************

%Save basic data set under different name for unified treatment of
%in-sample vs. out-of-sample analysis

est_data =data;
eval_data=est_data;

save([work_dir_path '\EST_DATA1.TXT'], 'est_data', '-ascii', '-tabs');
save([work_dir_path '\EVAL_DATA1.TXT'], 'eval_data', '-ascii', '-tabs');

%Generate bootstrap samples
for reps=2:(R+1)

    Y=data(:,1); X=data(:,2:K);

    index_Y0=find(Y==0); N0=length(index_Y0); %non-weeds
    index_Y1=find(Y==1); N1=length(index_Y1); %weeds

    X0=X(index_Y0,:); X1=X(index_Y1,:);

    est_index0=unidrnd(N0, N0, 1); %N0 random integers from 1 to N0
    eval_index0=setdiff([1:N0]', est_index0); %data points NOT in sample used for eval

    est_index1=unidrnd(N1, N1, 1); %N1 random integers from 1 to N1
    eval_index1=setdiff([1:N1]', est_index1); %data points NOT in sample used for eval

    est_data0=[zeros(N0,1) X0(est_index0,:)];
    est_data1=[ones(N1,1) X1(est_index1,:)];

    eval_data0=[zeros(length(eval_index0),1) X0(eval_index0,:)];
    eval_data1=[ ones(length(eval_index1),1) X1(eval_index1,:)];

    eval_data=[eval_data0; eval_data1];
    est_data=[est_data0; est_data1];

    save([work_dir_path '\EVAL_DATA' num2str(reps) '.TXT'], 'eval_data', '-ascii', '-tabs');
    save([work_dir_path '\EST_DATA' num2str(reps) '.TXT'], 'est_data', '-ascii', '-tabs');

end
 
        
%**************************************************************************
%                                ESTIMATION
%**************************************************************************

%                           1. ML estimation 
%                         *******************

for rep=1:(R+1)
        
        est_data=load([work_dir_path '\EST_DATA' num2str(rep) '.TXT']);
        %Loading the estimation data set

        Y=est_data(:,1); %Y is coded as a 0 vs 1 (as opposed to -1 vs 1) variable at this point
        X=est_data(:, 2:K);
        
        X_l=x_logit1(X);
        %Creating the appropriate transformation of X to be used in
        %estimating logit model. Number of columns not necessarily K any more.
        
        %Comment out the unneeded procedure
        coeff=est_logit_rewt(Y, X_l, tau); %estimate LOGIT model
        %coeff=est_cauchit_rewt(Y, X_l, tau); %estimate CAUCHIT model
        beta(rep,:)=coeff'; 
                  
end %data cycle
        
%                          2. MU Estimation
%                         ******************

for rep=1:(R+1) %data cycle
    
    %Display data cycle counter
    disp(' ')
    rep
    disp(' ')
    
    %Loading the estimation data set
    est_data=load([work_dir_path '\EST_DATA' num2str(rep) '.TXT']);
    
    Y=est_data(:,1); 
    Y=Y-(Y==0); %Y=0 --> Y=-1
    X=est_data(:, 2:K); 
    
    %Creating the appropriate cutoffs and weights
    
    c=cutoff1(X);
    b=b1(X);
    
    %Choosing the initial value to be used in the simulated
    %annealing procedure:
    
    init_theta(1,:)=beta(rep,:); %row vector
    init_theta(2,:)=l_bd + rand(1, p).*( u_bd - l_bd );
    
    X_m=x_m1(X);
    %Creating the appropriate transformation of X to be used as an
    %input in m1.m. Number of columns not necessarily K any more.
    
    Min_Val_Loc=MU_Est_Given_Data_Pref_Mod_Rewt( Y, X_m, c, b, tau, 'm1', init_theta, p, l_bd, u_bd );
    %MU estimation given the data, preferences and the model
    %specification:
    
    S_MU(rep,1)=Min_Val_Loc(1);
    theta_MU(rep,:)=Min_Val_Loc(1,2:length(Min_Val_Loc));
    
end %data cycle

%**************************************************************************
%                EVALUATION: in-sample and out-of-sample
%**************************************************************************
    
if exist('evaluation_tables.txt')==2; delete('evaluation_tables.txt'); end

diary('evaluation_tables.txt');
%Saves what's displayed on screen in text file 

%OUTPUT TABLE IN LATEX FORMAT:
%******************************

prediction_MU_basic_rewt_comm(work_dir_path, beta, theta_MU, tau, R);

diary off

